#Import libraries
import time
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
import logging
#from skbio.stats.composition import clr
import matplotlib.pyplot as plt
import matplotlib.axes as axes
#For CLR of ADTs
import scipy
import scipy.stats
from sklearn.preprocessing import scale
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
prostate_info = pd.read_csv("/home/unix/sjohri/valab_sjohri/projects/ex-vivo/data/final_patient_metadata.csv") # metadata table
prostate_info.tail()
adata = [None]*prostate_info.shape[0]
# adata = [None]*3
mtxs = prostate_info['CCPM Sample ID'].tolist()
mtxs = ["DFCI_" + x for x in mtxs]
meta_info = prostate_info.to_dict()
for index in range(0,len(mtxs)):
## read in 10x matrix files
print("Reading in file: {}".format(mtxs[index]))
adata[index] = sc.read_csv("/home/unix/sjohri/valab_sjohri/projects/ex-vivo/scanpy/contam_free_data/"+mtxs[index]+"_contam_free.csv").T
adata[index].var_names_make_unique()
# calculate percent mito and ribo
adata[index].var['mt'] = adata[index].var_names.str.startswith('MT-')
adata[index].var['ribo'] = adata[index].var_names.str.startswith(('RPS',"RPL"))
sc.pp.calculate_qc_metrics(adata[index], qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata[index], ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo'], jitter=0.4, multi_panel=True)
adata
count_percentiles = dict()
for i in range(0, len(adata)):
RNA_adata = adata[i]
count_cells = RNA_adata.obs['total_counts'].tolist()
sample = mtxs[i]
intervals = []
for x in range(0, 105, 5):
p = np.percentile(count_cells, x)
intervals.append(p)
count_percentiles[sample] = intervals
count_percentiles = pd.DataFrame(count_percentiles)
count_percentiles.index = range(0,105, 5)
count_percentiles.mean(axis = 1)
#Initialize dataframe to store cell counts
no_wells = len(adata)
height = 4
width = no_wells
contamination_df = pd.DataFrame(0, index=range(height), columns=range(width))
contamination_df.rename(index={0:'Cells',1:'Platelets',2:'RBCs',3:'% cell with < 50% mito'}, inplace=True)
contamination_df.columns = mtxs
for adata_index in range(len(mtxs)):
RNA_adata = adata[adata_index]
well = mtxs[adata_index]
print(well)
#Remove platelets
original_cell_no = RNA_adata.shape[0]
print('Annotating platelet contamination and Megakaryocytes.')
mat = csr_matrix(RNA_adata.X)
mat = mat[:, RNA_adata.var_names.isin(['PF4'])].todense()
if mat.shape[1]!=0:
platelets = RNA_adata[np.ravel(mat > 0)].obs.index
no_platelets = len(platelets)
platelet_info = [False] * original_cell_no
index = 0
for x in RNA_adata.obs.index:
if x in platelets:
platelet_info[index] = True
index += 1
RNA_adata.obs['platelet'] = platelet_info
else:
print("No platelets detected")
no_platelets = 0
RNA_adata.obs['platelet'] = [False] * original_cell_no
print('Annotating RBCs')
mat = csr_matrix(RNA_adata.X)
mat = mat[:, RNA_adata.var_names.isin(['HBB','HBA'])].todense()
if mat.shape[1]!=0:
RBCs = RNA_adata[np.ravel(mat > 1)].obs.index
no_RBCs = len(RBCs)
RBC_info = [False] * original_cell_no
index = 0
for x in RNA_adata.obs.index:
if x in RBCs:
RBC_info[index] = True
index += 1
RNA_adata.obs['RBC'] = RBC_info
else:
print("No RBCs detected")
no_RBCs = 0
RNA_adata.obs['RBC'] = [False] * original_cell_no
#Calculate quality metrics
cells_mito_cutoff = np.sum([RNA_adata.obs['pct_counts_mt'] < 50])/original_cell_no
contamination_df[well] = [original_cell_no, no_platelets, no_RBCs, cells_mito_cutoff]
adata[adata_index] = RNA_adata
contamination_df.T
fig,axs = plt.subplots(2,2,figsize=(15,10),)
bars = mtxs
y_pos = np.arange(len(bars))
# plt.setp(axs, xticks=y_pos, xticklabels=bars, fontsize=2, Rotation=45)
##Plot Cells
height = contamination_df.iloc[0,:]
axs[0,0].bar(y_pos, height)
axs[0,0].set_xticks(y_pos)
axs[0,0].set_title("Cells per well")
axs[0,0].set_xticklabels(bars, fontsize=10,rotation=90)
##Plot Platelets
height = contamination_df.iloc[1,:]/contamination_df.iloc[0,:]
axs[0,1].bar(y_pos, height)
axs[0,1].set_xticks(y_pos)
axs[0,1].set_title("Platelets per well")
axs[0,1].set_xticklabels(bars, fontsize=10,rotation=90)
##Plot RBCs
height = contamination_df.iloc[2,:]/contamination_df.iloc[0,:]
axs[1,0].bar(y_pos, height)
axs[1,0].set_xticks(y_pos)
axs[1,0].set_title("RBCs per well")
axs[1,0].set_xticklabels(bars, fontsize=10,rotation=90)
##Plot % mito
height = contamination_df.iloc[3,:]
axs[1,1].bar(y_pos, height)
axs[1,1].set_xticks(y_pos)
axs[1,1].set_title("% cells with < 50% mito")
axs[1,1].set_xticklabels(bars, fontsize=10,rotation=90)
# Show graphic
fig.tight_layout()
fig.show()
adata_concat = adata[0]
adata_concat = adata_concat.concatenate(adata[1:],join='inner')
adata_concat.var = adata_concat.var[["mt","ribo"]]
print(adata_concat)
sc.pp.filter_cells(adata_concat, min_genes=200)
sc.pp.filter_genes(adata_concat, min_cells=50)
# After combining all samples to one h5ad object, we can read into this single object directly.
path = "/home/unix/sjohri/valab_sjohri/projects/ex-vivo/scanpy/integrated_analysis/"
adata_concat_file = path + "all_concat.h5ad"
adata_concat.write(adata_concat_file, compression = "gzip")
percent_mito_cutoff = 50
#Filter cells by cells with % mito
before_mito = adata_concat.X.shape[0]
adata_concat = adata_concat[adata_concat.obs.pct_counts_mt < percent_mito_cutoff, :]
print("Dead cells removed: ", before_mito - adata_concat.X.shape[0])
# adata_concat = adata_concat[adata_concat.obs['RBC'] == False]
# adata_concat = adata_concat[adata_concat.obs['platelet'] == False]
#Normalize gene counts
sc.pp.normalize_total(adata_concat, target_sum=1e4)
# Logarithmize the data.
sc.pp.log1p(adata_concat)
#Store raw data, for finding markers in each cluster and other analysis
adata_concat.raw = adata_concat
#variable genes for the full dataset
sc.pp.highly_variable_genes(adata_concat, n_top_genes=5000)
sc.pl.highly_variable_genes(adata_concat)
print("Highly variable genes: %d"%sum(adata_concat.var.highly_variable))
var_genes_all = adata_concat.var.highly_variable
# Identify highly-variable genes.
sc.pp.highly_variable_genes(adata_concat, n_top_genes=5000, batch_key='batch')
sc.pl.highly_variable_genes(adata_concat)
print("Highly variable genes intersection: %d"%sum(adata_concat.var.highly_variable_intersection))
print("Number of batches where gene is variable:")
print(adata_concat.var.highly_variable_nbatches.value_counts())
var_genes_batch = adata_concat.var.highly_variable_nbatches > 0
print("Any batch var genes: %d"%sum(var_genes_batch))
print("All data var genes: %d"%sum(var_genes_all))
print("Overlap: %d"%sum(var_genes_batch & var_genes_all))
print("Variable genes in all batches: %d"%sum(adata_concat.var.highly_variable_nbatches ==3))
print("Overlap batch instersection and all: %d"%sum(var_genes_all & adata_concat.var.highly_variable_intersection))
var_select = adata_concat.var.highly_variable_nbatches > 1
var_genes = var_select.index[var_select]
len(var_genes)
#Filter to highly variable genes
adata_concat = adata_concat[:,var_genes]
sc.pp.regress_out(adata_concat, keys = ['pct_counts_mt','total_counts'])
adata_concat
sc.pp.scale(adata_concat, max_value=10)
# Set seed
intialization = 3120
sc.pp.pca(adata_concat, random_state=intialization, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_concat, n_pcs= 50, log=True, show = True)
# sc.pl.pca_overview(adata_concat,color="n_genes")
adata_concat.obsm['X_pca'].shape
adata_concat
normalized_pca = pd.DataFrame(adata_concat.obsm['X_pca'])
meta_data = adata_concat.obs
sc.external.pp.harmony_integrate(adata_concat, 'batch', max_iter_harmony=50)
adata_concat
adata_concat.obsm['X_pca'] = adata_concat.obsm['X_pca_harmony']
sc.pp.neighbors(adata_concat, random_state=intialization, n_neighbors=15, n_pcs=30)
sc.tl.louvain(adata_concat, random_state=intialization, resolution=1.0)
sc.tl.leiden(adata_concat, random_state=intialization, resolution=1.0)
sc.tl.umap(adata_concat, random_state=intialization)
#Final written out adata object:
adata_file = "/home/unix/sjohri/valab_sjohri/projects/ex-vivo/scanpy/integrated_analysis/final_obj.h5ad"
adata_concat.write(adata_file, compression = "gzip")
sc.pl.umap(adata_concat, color=['louvain'],legend_loc = 'on data', show = True)
sc.pl.umap(adata_concat, color=['louvain'], show = True)
sc.pl.umap(adata_concat, color=['leiden'],legend_loc = 'on data', show = True)
sc.pl.umap(adata_concat, color=['leiden'], show = True)
sc.pl.umap(adata_concat, color=['batch'], show = True)
leiden_prop = dict()
no_clusters = len(adata_concat.obs['leiden'].unique())
for sn in adata_concat.obs['batch'].unique():
temp = adata_concat[adata_concat.obs['batch'] == sn,:]
cell_count = temp.obs['leiden'].value_counts()
cell_count = cell_count.sort_index()
cell_count = cell_count.to_dict()
cell_count = [cell_count[str(x)] if str(x) in cell_count.keys() else 0 for x in range(0,no_clusters)]
leiden_prop[sn] = cell_count
leiden_prop = pd.DataFrame(leiden_prop).T
leiden_prop['total'] = leiden_prop.sum(axis = 1)
leiden_prct = pd.DataFrame()
for x in leiden_prop.columns:
leiden_prct[x] = leiden_prop[x]/leiden_prop[x].sum()
leiden_prct = leiden_prct.sort_index()
leiden_prct.T.plot.bar(stacked=True, legend = False,figsize=(7,7))
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol = 3)
adata_concat
sc.pl.umap(adata_concat, color=['PTPRC','CD8A','CD8B','CD3D','EPCAM','KRT18','KRT8'], show = True)
markers = pd.read_csv("/home/unix/sjohri/valab_sjohri/projects/ex-vivo/data/PanglaoDB_markers_27_Mar_2020.tsv", sep="\t")
hs_markers = markers[markers.species!="Mm"]
marker_genes_dict = {'Endothelial': [x for x in hs_markers[(hs_markers.cell_type=="Endothelial cells") & (hs_markers.canonical_marker==1)
& (hs_markers.ubiquitousness_index<=0.1) & (hs_markers.sensitivity_human>=0.75)
& (hs_markers.specificity_human<=0.25)].official_gene_symbol if x in adata_concat.raw.var_names],
'Smooth_muscle': [x for x in hs_markers[(hs_markers.cell_type=="Smooth muscle cells") & (hs_markers.canonical_marker==1)
& (hs_markers.ubiquitousness_index<=0.1) & (hs_markers.sensitivity_human>=0.75)
& (hs_markers.specificity_human<=0.25)].official_gene_symbol if x in adata_concat.raw.var_names],
'Epithelial': [x for x in hs_markers[(hs_markers.cell_type=="Epithelial cells")
& (hs_markers.ubiquitousness_index<=0.1) & (hs_markers.sensitivity_human>=0.75)
& (hs_markers.specificity_human<=0.25)].official_gene_symbol if x in adata_concat.raw.var_names],
'Fibroblasts': [x for x in hs_markers[(hs_markers.cell_type=="Fibroblasts") & (hs_markers.canonical_marker==1)
& (hs_markers.ubiquitousness_index<=0.1) & (hs_markers.sensitivity_human>=0.75)
& (hs_markers.specificity_human<=0.25)].official_gene_symbol if x in adata_concat.raw.var_names],
'Leukocytes': ['PTPRC','CD3D','MS4A1','NKG7','CD8A','GNLY','FCER1A','C1QA','C1QB','TYROBP','LYZ','AIF1'],
'Neuro': ['NRXN1','GPM6B','S100B','CRYAB']}
sc.pl.dotplot(adata_concat, marker_genes_dict,"leiden", dendrogram=True)
ax=sc.pl.correlation_matrix(adata_concat,groupby='leiden', figsize=(10,5))
sc.tl.rank_genes_groups(adata_concat, 'leiden', method='wilcoxon', corr_method="benjamini-hochberg")
sc.pl.rank_genes_groups(adata_concat, n_genes=25, sharey=False)
sc.pl.rank_genes_groups_tracksplot(adata_concat, n_genes=3)
# sc.tl.rank_genes_groups(adata_concat,'leiden', method='wilcoxon', corr_method="benjamini-hochberg")
# sc.pl.rank_genes_groups(adata_concat, n_genes=25, sharey=False)